In [1]:
from sqlalchemy import create_engine

import pandas as pd
import numpy as np
import cufflinks as cf


import plotly
import plotly.plotly as py
import plotly.graph_objs as go
plotly.offline.init_notebook_mode()

import seaborn as sns

from matplotlib import pyplot as plt
from matplotlib.ticker import EngFormatter
In [2]:
# MySQL-connector-python
engine = create_engine('mysql+mysqlconnector://<user>:<password>@igor.gold.ac.uk/<databasename>')
connection = engine.connect()

Q1 Have the amount of races per year increased?

In [3]:
df = pd.read_sql_query("""SELECT year, COUNT(*) FROM races GROUP BY year;""", engine)
In [4]:
df.rename(columns={'COUNT(*)':'numofraces',},inplace=True)
In [5]:
df.head()
Out[5]:
year numofraces
0 1950 7
1 1951 8
2 1952 8
3 1953 9
4 1954 9
In [6]:
df.dtypes
Out[6]:
year          int64
numofraces    int64
dtype: object
In [7]:
df.set_index(['year'],inplace=True)
In [8]:
#Create scatter plot combined with bar plot
trace1 = go.Scatter(y=df['numofraces'],
                    x=df.index,
                    name='Teams',
                    mode = 'lines+markers',
                    fill='tonexty')


#Establish data to be used in plot
data = [trace1]

#Set layout configuration, title, xaxis, yaxis, range for yvalues
layout = go.Layout(
    title='Races per Championship',
        autosize=False,
        width=850,
        height=350,
    xaxis=dict(
        title='Year'),
    yaxis=dict(
        title='Number',
        autorange=True,
#        range=[0,120]),

)
)

#create plot figure
fig = go.Figure(data=data, layout=layout)

#show plot
plotly.offline.iplot(fig)

# Summary Stats
print(df.describe())
print('median ',(df['numofraces'].median()))
       numofraces
count   69.000000
mean    14.449275
std      3.863617
min      7.000000
25%     11.000000
50%     16.000000
75%     17.000000
max     21.000000
median  16.0

Drivers per team


In [9]:
df2=pd.read_sql_query("""SELECT races.year, 
                        races.raceId, 
                        results.constructorId 
                        FROM results 
                        INNER JOIN races ON races.raceId = results.raceId
                        """, engine)
df2.head()
Out[9]:
year raceId constructorId
0 2008 18 1
1 2008 18 2
2 2008 18 3
3 2008 18 4
4 2008 18 1
In [10]:
df2.columns
Out[10]:
Index(['year', 'raceId', 'constructorId'], dtype='object')
In [11]:
df2 = df2.groupby('year')['constructorId'].nunique()

Q2 Have the number of drivers per team increased?

In [12]:
df3=pd.read_sql_query("""SELECT races.year, 
                        races.raceId, 
                        results.driverId 
                        FROM results 
                        INNER JOIN races ON races.raceId = results.raceId
                        """, engine)
df3.head()
Out[12]:
year raceId driverId
0 2008 18 1
1 2008 18 2
2 2008 18 3
3 2008 18 4
4 2008 18 5
In [13]:
df3.columns
Out[13]:
Index(['year', 'raceId', 'driverId'], dtype='object')
In [14]:
df3 = df3.groupby('year')['driverId'].nunique()
In [15]:
df2 = pd.DataFrame(df2)
df3 = pd.DataFrame(df3)
In [16]:
df4 = df2.join(df3, how='outer')
In [17]:
df4_2 = df2.join(df3, how='outer')
df4_2.head()
Out[17]:
constructorId driverId
year
1950 24 81
1951 24 84
1952 25 105
1953 21 108
1954 19 97
In [18]:
df4['driverperteam'] = df4['driverId']/df4['constructorId']
In [19]:
df4.head()
Out[19]:
constructorId driverId driverperteam
year
1950 24 81 3.375000
1951 24 84 3.500000
1952 25 105 4.200000
1953 21 108 5.142857
1954 19 97 5.105263
In [20]:
df4_2.rename(columns={'constructorId':'Teams','driverId':'Drivers'},inplace=True)
In [21]:
df_4_2_Stats=df4_2[['Teams','Drivers']].describe()
In [22]:
df_4_2_Stats['Driver/team'] = df4['driverperteam'].describe()
In [23]:
#Create scatter plot combined with bar plot
trace1 = go.Scatter(y=df4_2['Teams'],name='Teams',x=df4_2.index,mode = 'markers')
trace2 = go.Scatter(y=df4_2['Drivers'],name='Drivers',x=df4_2.index)
trace3 = go.Bar(y=df4['driverperteam'],name='Drivers per Team',yaxis='y2',opacity=0.3,x=df4.index,)

#Establish data to be used in plot
data = [trace3,trace1,trace2]

#Set layout configuration, title, xaxis, yaxis, specific range for yvalues
layout = go.Layout(
    title='Relationship of number and drivers and teams per Year',
    legend=dict(orientation="h",x=-.1, y=1.2),
        autosize=False,
        width=850,
        height=350,
    xaxis=dict(
        title='Year'),
    yaxis=dict(
        title='Number',
        autorange=False,
        range=[0,120]),
    yaxis2=dict(
        title='Drivers Per team',
        autorange=False,
        range=[0,12],
        titlefont=dict(color='rgb(0, 0, 0)'),
        tickfont=dict(color='rgb(0, 0, 0)'),
        showgrid=False,
        overlaying='y',
        side='right')
)

#create plot figure
fig = go.Figure(data=data, layout=layout)

#show plot
plotly.offline.iplot(fig)
print(df_4_2_Stats)
           Teams     Drivers  Driver/team
count  68.000000   68.000000    68.000000
mean   15.308824   44.985294     2.860233
std     4.368330   22.899190     0.946750
min    10.000000   22.000000     1.894737
25%    11.000000   26.750000     2.196875
50%    14.500000   38.000000     2.460526
75%    19.000000   54.000000     3.308036
max    26.000000  108.000000     5.666667
In [24]:
del df4['constructorId']
del df4['driverId']

Q3 Which are the fastest tracks by average lap speed performed by winners?

In [25]:
df5=pd.read_sql_query("""SELECT races.year,
                      races.name,
                      results.fastestLapSpeed
                      FROM results
                      INNER JOIN races ON races.raceId = results.raceId
                      WHERE results.position = 1
                      """, engine)
df5.head()
Out[25]:
year name fastestLapSpeed
0 2008 Australian Grand Prix 218.300
1 2008 Malaysian Grand Prix 209.158
2 2008 Bahrain Grand Prix 208.153
3 2008 Spanish Grand Prix 205.191
4 2008 Turkish Grand Prix 221.734
In [26]:
df5['fastestLapSpeed'] = pd.to_numeric(df5['fastestLapSpeed'],errors="coerce")
In [27]:
df5['name'] = df5['name'].astype('category')
In [28]:
df5.dtypes
Out[28]:
year                  int64
name               category
fastestLapSpeed     float64
dtype: object
In [29]:
table = pd.pivot_table(df5,index =['name'] ,columns=['year'])
In [30]:
table_to_plot = table.T
In [31]:
box_speed = table_to_plot.dropna(axis=1, how='all')
In [32]:
box_speed = box_speed.swaplevel(0, 1, axis=0)
In [33]:
box_speed.rename(columns={'Abu Dhabi Grand Prix':'Abu_Dhabi','Australian Grand Prix':'Australia'
                         ,'Austrian Grand Prix':'Austria','Azerbaijan Grand Prix':'Azerbaijan'
                         ,'Bahrain Grand Prix':'Bahrain','Belgian Grand Prix':'Belgium'
                         ,'Brazilian Grand Prix':'Brazil','British Grand Prix':'UK'
                         ,'Canadian Grand Prix':'Canada','Chinese Grand Prix':'China'
                         ,'European Grand Prix':'Europe','French Grand Prix':'France'
                         ,'German Grand Prix':'Germany','Hungarian Grand Prix':'Hungary'
                         ,'Indian Grand Prix':'India','Italian Grand Prix':'Italy'
                         ,'Japanese Grand Prix':'Japan','Korean Grand Prix':'Korea'
                         ,'Malaysian Grand Prix':'Malaysia','Mexican Grand Prix':'Mexico'
                         ,'Monaco Grand Prix':'Monaco','Russian Grand Prix':'Russia'
                         ,'San Marino Grand Prix':'SanMarino','Singapore Grand Prix':'Singarope'
                         ,'Spanish Grand Prix':'Spain','Turkish Grand Prix':'Turkey'
                         ,'United States Grand Prix':'USA'
                         },inplace=True)
In [34]:
#box_speed.T.mean()
In [35]:
#Create box plot of average speed in fastest lap

#Add series as an element, use the mean and add some jitter to data points
Italy = go.Box(y=box_speed['Italy'],name='Italy',boxmean=True)
Belgium = go.Box(y=box_speed['Belgium'],name='Belgium',boxmean=True)
UK = go.Box(y=box_speed['UK'],name='UK',boxmean=True)
Austria = go.Box(y=box_speed['Austria'],name='Austria',boxmean=True)
Turkey = go.Box(y=box_speed['Turkey'],name='Turkey',boxmean=True)
Australia = go.Box(y=box_speed['Australia'],name='Australia',boxmean=True)
Japan = go.Box(y=box_speed['Japan'],name='Japan',boxmean=True)
SanMarino = go.Box(y=box_speed['SanMarino'],name='SanMarino',boxmean=True)
Russia = go.Box(y=box_speed['Russia'],name='Russia',boxmean=True)
India = go.Box(y=box_speed['India'],name='India',boxmean=True)

#Establish data to be used in plot
data = [Italy, Belgium, UK, Austria, Turkey, Australia, Japan, SanMarino, Russia, India]

#Set layout configuration, title, xaxis, yaxis, specific range for yvalues
layout = go.Layout(
    title='Average speed in km/h of the Fastest Lap of the winner of the race',
       autosize=False,
        width=850,
        height=450,
    xaxis=dict(
        title='Grand Prix',
        type='category'
    ),
    yaxis=dict(
        title='km/h',
        autorange=True,
        nticks=5
    ),
)

#create plot figure
fig = go.Figure(data=data, layout=layout)

#show plot
plotly.offline.iplot(fig)

print(box_speed.describe().T.sort_values(by=['mean'],ascending=False).iloc[0:10,]['mean'].describe())
count     10.000000
mean     220.778195
std       10.253106
min      209.598667
25%      215.314643
50%      217.603893
75%      224.083661
max      244.591357
Name: mean, dtype: float64

Q4 Which are the most frequent breakdowns per decade, have the cars improved?

In [36]:
df6=pd.read_sql_query("""
                    SELECT races.year, 
                    races.name, 
                    status.statusId, 
                    status.status 
                    FROM status 
                    INNER JOIN results ON status.statusId = results.statusId 
                    INNER JOIN races ON races.raceId = results.raceId 
                    WHERE results.statusId <> 1
                      """, engine)
df6.head()
Out[36]:
year name statusId status
0 2008 Australian Grand Prix 11 +1 Lap
1 2008 Australian Grand Prix 5 Engine
2 2008 Australian Grand Prix 5 Engine
3 2008 Australian Grand Prix 4 Collision
4 2008 Australian Grand Prix 3 Accident
In [37]:
df6['status'] = df6['status'].astype('category')
df6['name'] = df6['name'].astype('category')
In [38]:
df6.dtypes
Out[38]:
year           int64
name        category
statusId       int64
status      category
dtype: object
In [39]:
year_breakdown = df6
In [40]:
del year_breakdown['name']
del year_breakdown['statusId']
In [41]:
year_breakdown = pd.crosstab(year_breakdown.status, year_breakdown.year, margins=False)
In [42]:
year_breakdown = year_breakdown.T
In [43]:
year_breakdown = year_breakdown[year_breakdown.columns.drop(list(year_breakdown.filter(regex='Lap')))]
year_breakdown = year_breakdown[year_breakdown.columns.drop(list(year_breakdown.filter(regex='Laps')))]
In [44]:
year_breakdown = year_breakdown.sort_index(axis=0 ,ascending=False)
year_breakdown = year_breakdown.sort_index(axis=1 ,ascending=False)
In [45]:
breakdown_evolution = pd.DataFrame(year_breakdown.iloc[58:68,].sum(),columns=['50-59'])
In [46]:
breakdown_evolution['60-69'] = year_breakdown.iloc[48:58,].sum()
breakdown_evolution['70-79'] = year_breakdown.iloc[38:48,].sum()
breakdown_evolution['80-89'] = year_breakdown.iloc[28:38,].sum()
breakdown_evolution['90-99'] = year_breakdown.iloc[18:28,].sum()
breakdown_evolution['00-09'] = year_breakdown.iloc[8:18,].sum()
breakdown_evolution['10-17'] = year_breakdown.iloc[0:8,].sum()
In [47]:
breakdown_evolution.head()
Out[47]:
50-59 60-69 70-79 80-89 90-99 00-09 10-17
status
Withdrew 26 124 36 20 11 15 9
Wheel rim 0 0 0 0 0 3 0
Wheel nut 0 0 0 0 0 1 7
Wheel bearing 7 6 4 11 9 0 0
Wheel 10 5 16 15 16 5 14
In [48]:
breakdown_evolution = breakdown_evolution.T
breakdown_pareto = pd.DataFrame(breakdown_evolution.mean().sort_values(ascending=False),columns=['Average'])
breakdown_pareto['%'] = breakdown_evolution.mean().sort_values(ascending=False)/breakdown_evolution.mean().sort_values(ascending=False).sum()
breakdown_pareto['Pareto'] = breakdown_pareto['%'].cumsum()
In [49]:
#breakdown_pareto['Pareto'][0:20]
In [50]:
top20 = breakdown_evolution[['Engine','Did not qualify','Accident','Gearbox','Spun off','Collision',
                    'Suspension','Did not prequalify','Transmission','Electrical','Withdrew','Brakes',
                    'Clutch','Not classified','Fuel system','Turbo','Disqualified','Hydraulics','Ignition'
                    ,'Overheating']].copy()
In [51]:
# Initialize the matplotlib figure
fig, ax = plt.subplots()
fig.set_size_inches(14,7)
# Plot the data
plt.title('Top 20 reasons to not finish a race per decade \n')
ax = sns.heatmap(top20.T,robust=True,linewidths=.5,cmap='Oranges')
plt.show()
print(top20.describe().T.sort_values(by=['mean'],ascending=False)['mean'].head())
print(top20.describe().T.sort_values(by=['mean'],ascending=False)['mean'].tail())
status
Engine             282.857143
Did not qualify    146.428571
Accident           144.571429
Gearbox            112.857143
Spun off           112.571429
Name: mean, dtype: float64
status
Turbo           20.428571
Disqualified    19.571429
Hydraulics      18.571429
Ignition        18.285714
Overheating     18.000000
Name: mean, dtype: float64
In [52]:
year_breakdown = year_breakdown.reindex(index=year_breakdown.index[::-1])
In [53]:
# Initialize the matplotlib figure
fig, ax = plt.subplots()
fig.set_size_inches(14,7)
# Plot the data
plt.title('Most frequent reasons to quit a race per year 08-17')
ax = sns.heatmap(year_breakdown[['Engine','Did not qualify','Accident','Gearbox','Spun off','Collision',
                    'Suspension','Did not prequalify','Transmission','Electrical','Withdrew','Brakes',
                    'Clutch','Not classified','Fuel system','Turbo','Disqualified','Hydraulics','Ignition'
                    ,'Overheating']].iloc[58:68,].T, linewidths=.5,cmap='Oranges',yticklabels=False)
plt.show()

print(year_breakdown[['Engine','Did not qualify','Accident','Gearbox','Spun off','Collision',
                    'Suspension','Did not prequalify','Transmission','Electrical','Withdrew','Brakes',
                    'Clutch','Not classified','Fuel system','Turbo','Disqualified','Hydraulics','Ignition'
                    ,'Overheating']].iloc[58:68,].describe().T.sort_values(by=['mean'],ascending=False)['mean'].head())
status
Collision    18.5
Accident     10.9
Engine        7.4
Gearbox       5.9
Brakes        4.3
Name: mean, dtype: float64

Q5 Which are the teams with most wins?

In [54]:
df8=pd.read_sql_query("""
SELECT races.year, 
races.round,
constructors.name, 
constructorStandings.position
FROM constructors 
INNER JOIN constructorStandings
ON constructors.constructorId = constructorStandings.constructorId 
INNER JOIN races 
ON constructorStandings.raceId = races.raceId  
                      """, engine)

#
df8.head()
Out[54]:
year round name position
0 2008 1 McLaren 1
1 2008 1 BMW Sauber 3
2 2008 1 Williams 2
3 2008 1 Renault 4
4 2008 1 Toro Rosso 5
In [55]:
df8['name'] = df8['name'].astype('category')
df8['round'] = df8['round'].astype('category')
In [56]:
winners = df8.copy()
In [57]:
winners.set_index(['year'],inplace=True)
In [58]:
winners = pd.crosstab(winners.name,winners.position, margins=False)
In [59]:
top_winners = pd.DataFrame(winners[1])
In [60]:
top_winners.rename(columns={'[1]':'Wins',
                         },inplace=True)
In [61]:
top_winners = top_winners.transpose()
In [62]:
top_winners.rename(index={1:'numofwins'}, inplace=True)
In [63]:
top_winners = top_winners.T.sort_values("numofwins", ascending=False)
In [64]:
top_winners = top_winners[(top_winners.numofwins > 0)]
In [65]:
top_winners.head()
Out[65]:
numofwins
name
Ferrari 224
McLaren 165
Williams 114
Mercedes 75
Red Bull 63
In [66]:
sns.set(style="whitegrid")
# Initialize the matplotlib figure
f, ax = plt.subplots(figsize=(12, 6))

# Plot the data
sns.barplot(palette="Reds_r",
            data=top_winners.T,
            orient='h',
            label="Total")

# Add a legend and informative axis label
plt.title('Number of 1st place podiums in all the years \n')
ax.set(ylabel="Teams",
       xlabel="Total Amount of wins")
sns.despine(left=True, bottom=True)

plt.show()

print(top_winners.head())
          numofwins
name               
Ferrari         224
McLaren         165
Williams        114
Mercedes         75
Red Bull         63
In [67]:
wins = np.array([224,165,114])
runs = np.array([951,825,685])
In [68]:
wins/runs
Out[68]:
array([ 0.23554154,  0.2       ,  0.16642336])

Q6 Which country produces the most drivers?

In [69]:
df9=pd.read_sql_query("""
SELECT *
FROM drivers
                      """, engine)
df9.head()
Out[69]:
driverId driverRef number code forename surname dob nationality url
0 1 hamilton 44.0 HAM Lewis Hamilton 1985-01-07 British http://en.wikipedia.org/wiki/Lewis_Hamilton
1 2 heidfeld NaN HEI Nick Heidfeld 1977-05-10 German http://en.wikipedia.org/wiki/Nick_Heidfeld
2 3 rosberg 6.0 ROS Nico Rosberg 1985-06-27 German http://en.wikipedia.org/wiki/Nico_Rosberg
3 4 alonso 14.0 ALO Fernando Alonso 1981-07-29 Spanish http://en.wikipedia.org/wiki/Fernando_Alonso
4 5 kovalainen NaN KOV Heikki Kovalainen 1981-10-19 Finnish http://en.wikipedia.org/wiki/Heikki_Kovalainen
In [70]:
df9.shape
Out[70]:
(842, 9)
In [71]:
drivers_per_contry = df9.drop(columns=['driverRef', 'number','code','forename','surname','url','dob','driverId'])
In [72]:
drivers_per_contry.head()
Out[72]:
nationality
0 British
1 German
2 German
3 Spanish
4 Finnish
In [73]:
drivers_per_contry['nationality'] = drivers_per_contry['nationality'].astype('category')
In [74]:
drivers_per_contry = drivers_per_contry.apply(pd.value_counts)
In [75]:
code = ['GBR', 'USA', 'ITA', 'FRA', 'DEU',
                  'BRA', 'ARG', 'ZAF', 'BEL',
                  'CHE', 'JPN', 'AUS', 'NLD', 'ESP',
                  'AUT', 'CAN', 'SWE', 'FIN',
                  'NZL', 'MEX', 'IRL', 'DNK', 'URY',
                  'PRT', 'RHA', 'COL', 'VEN',
                  'DEU', 'MCO', 'RUS', 'IND', 'CZE',
                  'HUN', 'CHL', 'IDN', 'LIE',
                  'MYS', 'POL', 'ARG', 'THA',
                  'USA']
In [76]:
code = pd.Series(code)
drivers_per_contry['code'] = code.values
In [77]:
drv_per_contry = drivers_per_contry.drop(['Argentine-Italian', 'American-Italian','East German','Rhodesian'])
In [78]:
drv_per_contry['nation'] = drv_per_contry.index
In [79]:
drv_per_contry_pct = pd.DataFrame(drv_per_contry['nationality']/drv_per_contry['nationality'].sum().copy())
In [142]:
#Create choropleth world map 

#Establish data to be used in plot
data = dict(
        type = 'choropleth',
        locations = drv_per_contry['code'],
        z = drv_per_contry['nationality'].astype(float),
        text = drv_per_contry['nation'],
        colorscale = [[0,"rgb(5, 10, 172)"],[0.35,"rgb(40, 60, 190)"],[0.5,"rgb(70, 100, 245)"],
            [0.6,"rgb(90, 120, 245)"],[0.7,"rgb(106, 137, 247)"],[1,"rgb(220, 220, 220)"]],
        autocolorscale = False,
        reversescale = True,
        marker = dict(
            line = dict (
                color = 'rgb(220,220,220)',
                width = 0.5
            ) ),
        colorbar = dict(
            title = 'Number of Drivers'),
      )
data = go.Data([data])

#Set layout configuration, title, type of map
layout = dict(
    title = 'Formula 1 Drivers per Country',
    
    geo = dict(
        showframe = True,
        showcoastlines = True,
        projection = dict(
            type = 'equirectangular'
        )
    )
)
layout = go.Layout(layout)

#create plot figure
fig = go.Figure(data=data, layout=layout)

#show plot
plotly.offline.iplot(fig)

#Create pie chart

#Establish data to be used in plot
labels = ['British','American','Italian','Others']
values = [drv_per_contry['nationality'].iloc[0],
          drv_per_contry['nationality'].iloc[1],
          drv_per_contry['nationality'].iloc[2],
#          drv_per_contry['nationality'].iloc[3],
          (drv_per_contry['nationality'].sum()-drv_per_contry['nationality'].iloc[0:3,].sum())]
colors = ['#FF754E', '#D3A0E8', '#5AA0FF', '#66E896','#FFFB4E']

trace = go.Pie(labels=labels, 
               values=values,
               hoverinfo='label+percent', 
               textinfo='label+percent', 
               textfont=dict(size=16),
               marker=dict(colors=colors,
                           line=dict(color='#000000', width=1)))
#show plot
plotly.offline.iplot([trace])

Champions per country

In [81]:
import html5lib
In [82]:
driver_standings = pd.DataFrame()
for i in range (1950,2017):
    new = pd.read_html('https://www.formula1.com/en/results.html/'+str(i)+'/drivers.html')[0]
    new['year']= str(i)
    driver_standings = driver_standings.append(new,ignore_index=True)

driver_standings = driver_standings.drop(columns=['Unnamed: 0', 'Unnamed: 6'])
In [83]:
driver_standings.set_index(['year'],inplace=True)
In [84]:
driver_standings['Nationality'] = driver_standings['Nationality'].astype('category')
driver_standings['Car'] = driver_standings['Car'].astype('category')
driver_standings['Driver'] = driver_standings['Driver'].astype('category')
driver_standings['Pos'] = pd.to_numeric(driver_standings['Pos'],errors="coerce")
In [85]:
winners_per_country = driver_standings.loc[driver_standings['Pos'] == 1].copy()
In [86]:
winners_per_country.dtypes
Out[86]:
Pos             float64
Driver         category
Nationality    category
Car            category
Pts             float64
dtype: object
In [140]:
#Create pie chart

#Establish data to be used in plot
labels = ['British','German','Brazilian','Others']
values = [(winners_per_country['Nationality'].value_counts().iloc[0]),
          (winners_per_country['Nationality'].value_counts().iloc[1]),
          (winners_per_country['Nationality'].value_counts().iloc[2]),
#          drv_per_contry['nationality'].iloc[3],
          (winners_per_country['Nationality'].value_counts().sum())-(winners_per_country['Nationality'].value_counts().iloc[0:3].sum())]
colors = ['#FF754E', '#D3A0E8', '#5AA0FF', '#66E896','#FFFB4E']

trace = go.Pie(labels=labels, 
               values=values,
               hoverinfo='label+percent', 
               textinfo='label+percent', 
               textfont=dict(size=16),
               marker=dict(colors=colors,
                           line=dict(color='#000000', width=1)))
#show plot
plotly.offline.iplot([trace])